﻿using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Linq;
using System.ServiceProcess;
using System.Text;
using Feng.SimpleCrawler;
using Feng.DbHelper;
using Feng.Log;
using HtmlAgilityPack;

namespace Feng.Demo
{
    /// <summary>
    /// windows服务
    /// </summary>
    partial class FengCnblogsService : ServiceBase
    {

        #region 构造函数
        /// <summary>
        /// 构造函数
        /// </summary>
        public FengCnblogsService()
        {
            InitializeComponent();
        } 
        #endregion

        #region 字段属性

        /// <summary>
        /// 蜘蛛爬虫的设置
        /// </summary>
        private static readonly CrawlSettings Settings = new CrawlSettings();

        /// <summary>
        /// 临时内存表存储数据
        /// </summary>
        private static DataTable dt = new DataTable();

        /// <summary>
        /// 关于 Filter URL：http://www.cnblogs.com/heaad/archive/2011/01/02/1924195.html
        /// </summary>
        private static BloomFilter<string> filter;

        #endregion

        #region 启动服务
        /// <summary>
        /// TODO: 在此处添加代码以启动服务。
        /// </summary>
        /// <param name="args"></param>
        protected override void OnStart(string[] args)
        {

            ProcessStart();
        } 
        #endregion

        #region 停止服务
        /// <summary>
        /// TODO: 在此处添加代码以执行停止服务所需的关闭操作。
        /// </summary>
        protected override void OnStop()
        {

        } 
        #endregion

        #region 程序开始处理
        /// <summary>
        /// 程序开始处理
        /// </summary>
        private void ProcessStart()
        {

            dt.Columns.Add("BlogTitle", typeof(string));
            dt.Columns.Add("BlogUrl", typeof(string));
            dt.Columns.Add("BlogAuthor", typeof(string));
            dt.Columns.Add("BlogTime", typeof(string));
            dt.Columns.Add("BlogMotto", typeof(string));
            dt.Columns.Add("BlogDepth", typeof(string));

            filter = new BloomFilter<string>(200000);
            const string CityName = "";
            #region 设置种子地址
            // 设置种子地址 
            Settings.SeedsAddress.Add(string.Format("http://www.cnblogs.com/{0}", CityName));

            Settings.SeedsAddress.Add("http://www.cnblogs.com/artech");
            Settings.SeedsAddress.Add("http://www.cnblogs.com/wuhuacong/");


            Settings.SeedsAddress.Add("http://www.cnblogs.com/dudu/");
            Settings.SeedsAddress.Add("http://www.cnblogs.com/guomingfeng/");

            Settings.SeedsAddress.Add("http://www.cnblogs.com/daxnet/");
            Settings.SeedsAddress.Add("http://www.cnblogs.com/fenglingyi");

            Settings.SeedsAddress.Add("http://www.cnblogs.com/ahthw/");
            Settings.SeedsAddress.Add("http://www.cnblogs.com/wangweimutou/");
            #endregion

            #region 设置 URL 关键字
            Settings.HrefKeywords.Add("a/");
            Settings.HrefKeywords.Add("b/");
            Settings.HrefKeywords.Add("c/");
            Settings.HrefKeywords.Add("d/");

            Settings.HrefKeywords.Add("e/");
            Settings.HrefKeywords.Add("f/");
            Settings.HrefKeywords.Add("g/");
            Settings.HrefKeywords.Add("h/");


            Settings.HrefKeywords.Add("i/");
            Settings.HrefKeywords.Add("j/");
            Settings.HrefKeywords.Add("k/");
            Settings.HrefKeywords.Add("l/");


            Settings.HrefKeywords.Add("m/");
            Settings.HrefKeywords.Add("n/");
            Settings.HrefKeywords.Add("o/");
            Settings.HrefKeywords.Add("p/");

            Settings.HrefKeywords.Add("q/");
            Settings.HrefKeywords.Add("r/");
            Settings.HrefKeywords.Add("s/");
            Settings.HrefKeywords.Add("t/");


            Settings.HrefKeywords.Add("u/");
            Settings.HrefKeywords.Add("v/");
            Settings.HrefKeywords.Add("w/");
            Settings.HrefKeywords.Add("x/");

            Settings.HrefKeywords.Add("y/");
            Settings.HrefKeywords.Add("z/");
            #endregion


            // 设置爬取线程个数
            Settings.ThreadCount = 1;

            // 设置爬取深度
            Settings.Depth = 55;

            // 设置爬取时忽略的 Link，通过后缀名的方式，可以添加多个
            Settings.EscapeLinks.Add("http://www.oschina.net/");

            // 设置自动限速，1~5 秒随机间隔的自动限速
            Settings.AutoSpeedLimit = false;

            // 设置都是锁定域名,去除二级域名后，判断域名是否相等，相等则认为是同一个站点
            Settings.LockHost = false;

            Settings.RegularFilterExpressions.Add(@"http://([w]{3}.)+[cnblogs]+.com/");
            var master = new CrawlMaster(Settings);
            master.AddUrlEvent += MasterAddUrlEvent;
            master.DataReceivedEvent += MasterDataReceivedEvent;

            master.Crawl();

        }
        
        #endregion

        #region 打印Url
        /// <summary>
        /// The master add url event.
        /// </summary>
        /// <param name="args">
        /// The args.
        /// </param>
        /// <returns>
        /// The <see cref="bool"/>.
        /// </returns>
        private static bool MasterAddUrlEvent(AddUrlEventArgs args)
        {
            if (!filter.Contains(args.Url))
            {
                filter.Add(args.Url);
                Console.WriteLine(args.Url);

                if (dt.Rows.Count > 200)
                {
                    MssqlHelper.InsertDb(dt);
                    dt.Rows.Clear();
                }

                return true;
            }

            return false; // 返回 false 代表：不添加到队列中
        }
        #endregion

        #region 解析HTML
        /// <summary>
        /// The master data received event.
        /// </summary>
        /// <param name="args">
        /// The args.
        /// </param>
        private static void MasterDataReceivedEvent(SimpleCrawler.DataReceivedEventArgs args)
        {
            // 在此处解析页面，可以用类似于 HtmlAgilityPack（页面解析组件）的东东、也可以用正则表达式、还可以自己进行字符串分析

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(args.Html);

            HtmlNode node = doc.DocumentNode.SelectSingleNode("//title");
            string title = node.InnerText;

            HtmlNode node2 = doc.DocumentNode.SelectSingleNode("//*[@id='post-date']");
            string time = node2.InnerText;

            HtmlNode node3 = doc.DocumentNode.SelectSingleNode("//*[@id='topics']/div/div[3]/a[1]");
            string author = node3.InnerText;

            HtmlNode node6 = doc.DocumentNode.SelectSingleNode("//*[@id='blogTitle']/h2");
            string motto = node6.InnerText;

            MssqlHelper.GetData(title, args.Url, author, time, motto, args.Depth.ToString(), dt);

            LogHelper.WriteLog(title);
            LogHelper.WriteLog(args.Url);
            LogHelper.WriteLog(author);
            LogHelper.WriteLog(time);
            LogHelper.WriteLog(motto == "" ? "null" : motto);
            LogHelper.WriteLog(args.Depth + "&dt.Rows.Count=" + dt.Rows.Count);

            //每次超过100条数据就存入数据库，可以根据自己的情况设置数量
            if (dt.Rows.Count > 100)
            {
                MssqlHelper.InsertDb(dt);
                dt.Rows.Clear();
            }

        }
        #endregion


    }
}
